import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf

import plotly
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

#print(__version__) # requires version >= 1.9.0

# For Notebooks
init_notebook_mode(connected=True)

# For offline use
cf.go_offline()

%matplotlib inline

school_frame = pd.read_csv('2016 School Explorer.csv')
shsat_frame = pd.read_csv("D5 SHSAT Registrations and Testers.csv")

pd.set_option('display.max_columns', None)  
school_frame.head()

school_frame.shape

(1272, 161)

school_frame.columns

Index(['Adjusted Grade', 'New?', 'Other Location Code in LCGMS', 'School Name',
       'SED Code', 'Location Code', 'District', 'Latitude', 'Longitude',
       'Address (Full)',
       ...
       'Grade 8 Math - All Students Tested', 'Grade 8 Math 4s - All Students',
       'Grade 8 Math 4s - American Indian or Alaska Native',
       'Grade 8 Math 4s - Black or African American',
       'Grade 8 Math 4s - Hispanic or Latino',
       'Grade 8 Math 4s - Asian or Pacific Islander',
       'Grade 8 Math 4s - White', 'Grade 8 Math 4s - Multiracial',
       'Grade 8 Math 4s - Limited English Proficient',
       'Grade 8 Math 4s - Economically Disadvantaged'],
      dtype='object', length=161)

# Function converting string % values to int
def percent_to_int(df_in):
    for col in df_in.columns.values:
        if col.startswith("Percent") or col.endswith("%") or col.endswith("Rate"):
            df_in[col] = df_in[col].astype(np.object).str.replace('%', '').astype(float)
    return df_in

school_frame = percent_to_int(school_frame)

#Filtering out non Middle Schools
school_frame_raw = school_frame[school_frame['Grade High'] != '0K'] 
#school_frame_relevant_grade = school_frame[school_frame['Grade High'].astype(int) > 5]

#Cleaning School Income Estimate
school_frame_raw['School Income Estimate'] = school_frame['School Income Estimate'].astype(np.object).str.replace('$', '').str.replace(',', '').str.replace('.', '').astype(float)



#Removing >95% null columns
high_nan_columns = school_frame_raw.columns[school_frame_raw.isnull().mean()>0.95]
#school_frame = school_frame_relevant_grade.drop(high_nan_columns, axis=1)
#print(list(high_bnan_columns))

#school_frame_raw contains all grades while school_frame contains SHSAT eligible middle schools
school_frame_relevant_grades = school_frame_raw[school_frame_raw['Grade High'].astype(int) > 5]
school_frame = school_frame_relevant_grades.drop(high_nan_columns, axis=1)

#Filling null values which prevent calculations
school_frame['School Income Estimate'] = school_frame['School Income Estimate'].fillna(0)
school_frame['Economic Need Index'] = school_frame['Economic Need Index'].fillna(0)

print("We have %d relevant schools and %d fields describing the school/ students"%(school_frame.shape))

We have 626 relevant schools and 158 fields describing the school/ students

C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

#Distributions of schools based on city
schools_by_city = school_frame['City'].value_counts()
schools_by_city.iplot(kind='bar',yTitle='Number of SHSAT Eligible Schools', title='SHSAT Eligible Schools by City', filename='cufflinks/categorical-bar-chart')

#Analysis of Chronically Absent Students

school_frame = school_frame.dropna(subset = ['Percent of Students Chronically Absent'])
temp = sns.distplot(school_frame[['Percent of Students Chronically Absent']].values, kde=False)
temp = plt.title('Distribution of Schools Based on Chronically Absent Students')
temp = plt.xlabel("Percent of students")
temp = plt.ylabel("Count")

chronically_absent_foi = ['School Name', 'Percent of Students Chronically Absent','Percent Black / Hispanic','Economic Need Index']

df_schools_ca = school_frame[school_frame['Percent of Students Chronically Absent'] > 40]
df_schools_nca = school_frame[school_frame['Percent of Students Chronically Absent'] < 5]

df_schools_ca[chronically_absent_foi].sort_values('Percent of Students Chronically Absent', ascending = False)

plt.figure(figsize = [16,7])
plt.suptitle('Statistics for Schools with High Chronically Absent Percentage', fontsize=15)
plt.subplot(1,2,1)
temp = sns.distplot(df_schools_ca[['Economic Need Index']].values, kde=False)
temp = plt.xlabel("Economic Need Index", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)
plt.subplot(1,2,2)
temp = sns.distplot(df_schools_ca[['Percent Black / Hispanic']].values, kde=False)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("School count", fontsize=15)

data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Economic Need Index"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York School Population (Economic Need Index)',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    ))
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels.html')

#Black Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Black"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Black Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')

#Hispanic Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Hispanic"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Hispanic Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')

#combination of black and latino
#bl_frame = school_frame["Percent Hispanic"] + school_frame["Percent Black"] 

#Hispanic Population
data = [
    {
        'x': school_frame["Longitude"],
        'y': school_frame["Latitude"],
        'text': school_frame["School Name"],
        'mode': 'markers',
        'marker': {
            'color': school_frame["Percent Black / Hispanic"],
            'size': school_frame["School Income Estimate"]/4500,
            'showscale': True,
            'colorscale':'Portland'
        }
    }
]

layout= go.Layout(
    title= 'New York Black/Hispanic Student Ratio Of School',
    xaxis= dict(
        title= 'Longitude'
    ),
    yaxis=dict(
        title='Latitude'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter_hover_labels')

lower_bound = school_frame['Economic Need Index'].quantile(0.75)
high_eni_schools = school_frame[school_frame['Economic Need Index'] > lower_bound]
low_eni_schools = school_frame[school_frame['Economic Need Index'] < 0.25]

high_eni_schools[['School Name', 'Economic Need Index', 'Percent Black / Hispanic']].sort_values('Economic Need Index', ascending = False).head(10)

print ("Average ELA / Math performance for the high ENI schools")
high_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()

Average ELA / Math performance for the high ENI schools

Average ELA Proficiency     2.247792
Average Math Proficiency    2.261688
dtype: float64

print ("Average ELA / Math performance for the low ENI schools")
low_eni_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()

Average ELA / Math performance for the low ENI schools

Average ELA Proficiency     3.267917
Average Math Proficiency    3.422083
dtype: float64

#After school programs
#Start by looking at which schools have the lowest income estimates. 
income_estimate = school_frame[['School Name','School Income Estimate']].sort_values(by='School Income Estimate',ascending=True)

#There are several null columns here.. this makes it difficult to use this category as a method of predicting need

null_columns=income_estimate.columns[income_estimate.isnull().any()]
income_estimate[null_columns].isnull().sum()

Series([], dtype: float64)

#Lets see how many null columns in the entire school frame
#There is a considerable amount missing (31%) in the School Income Estimate, leading me to want to avoid using it
null_columns=school_frame.columns[school_frame.isnull().any()]
school_frame[null_columns].isnull().sum()

Rigorous Instruction Rating            24
Collaborative Teachers Rating          24
Supportive Environment Rating          28
Effective School Leadership Rating     22
Strong Family-Community Ties Rating    23
Trust Rating                           23
Student Achievement Rating             22
Average ELA Proficiency                 7
Average Math Proficiency                7
dtype: int64

#Lets see how many null columns in the entire school frame
#Conclusion: Doesn't seem to be any
null_columns=shsat_frame.columns[shsat_frame.isnull().any()]
shsat_frame[null_columns].isnull().sum()

Series([], dtype: float64)

column_list = school_frame.columns.values

for i,a in enumerate(column_list):
   print('%s is %d' % (a, i))

School Name is 0
SED Code is 1
Location Code is 2
District is 3
Latitude is 4
Longitude is 5
Address (Full) is 6
City is 7
Zip is 8
Grades is 9
Grade Low is 10
Grade High is 11
Community School? is 12
Economic Need Index is 13
School Income Estimate is 14
Percent ELL is 15
Percent Asian is 16
Percent Black is 17
Percent Hispanic is 18
Percent Black / Hispanic is 19
Percent White is 20
Student Attendance Rate is 21
Percent of Students Chronically Absent is 22
Rigorous Instruction % is 23
Rigorous Instruction Rating is 24
Collaborative Teachers % is 25
Collaborative Teachers Rating is 26
Supportive Environment % is 27
Supportive Environment Rating is 28
Effective School Leadership % is 29
Effective School Leadership Rating is 30
Strong Family-Community Ties % is 31
Strong Family-Community Ties Rating is 32
Trust % is 33
Trust Rating is 34
Student Achievement Rating is 35
Average ELA Proficiency is 36
Average Math Proficiency is 37
Grade 3 ELA - All Students Tested is 38
Grade 3 ELA 4s - All Students is 39
Grade 3 ELA 4s - American Indian or Alaska Native is 40
Grade 3 ELA 4s - Black or African American is 41
Grade 3 ELA 4s - Hispanic or Latino is 42
Grade 3 ELA 4s - Asian or Pacific Islander is 43
Grade 3 ELA 4s - White is 44
Grade 3 ELA 4s - Multiracial is 45
Grade 3 ELA 4s - Limited English Proficient is 46
Grade 3 ELA 4s - Economically Disadvantaged is 47
Grade 3 Math - All Students tested is 48
Grade 3 Math 4s - All Students is 49
Grade 3 Math 4s - American Indian or Alaska Native is 50
Grade 3 Math 4s - Black or African American is 51
Grade 3 Math 4s - Hispanic or Latino is 52
Grade 3 Math 4s - Asian or Pacific Islander is 53
Grade 3 Math 4s - White is 54
Grade 3 Math 4s - Multiracial is 55
Grade 3 Math 4s - Limited English Proficient is 56
Grade 3 Math 4s - Economically Disadvantaged is 57
Grade 4 ELA - All Students Tested is 58
Grade 4 ELA 4s - All Students is 59
Grade 4 ELA 4s - American Indian or Alaska Native is 60
Grade 4 ELA 4s - Black or African American is 61
Grade 4 ELA 4s - Hispanic or Latino is 62
Grade 4 ELA 4s - Asian or Pacific Islander is 63
Grade 4 ELA 4s - White is 64
Grade 4 ELA 4s - Multiracial is 65
Grade 4 ELA 4s - Limited English Proficient is 66
Grade 4 ELA 4s - Economically Disadvantaged is 67
Grade 4 Math - All Students Tested is 68
Grade 4 Math 4s - All Students is 69
Grade 4 Math 4s - American Indian or Alaska Native is 70
Grade 4 Math 4s - Black or African American is 71
Grade 4 Math 4s - Hispanic or Latino is 72
Grade 4 Math 4s - Asian or Pacific Islander is 73
Grade 4 Math 4s - White is 74
Grade 4 Math 4s - Multiracial is 75
Grade 4 Math 4s - Limited English Proficient is 76
Grade 4 Math 4s - Economically Disadvantaged is 77
Grade 5 ELA - All Students Tested is 78
Grade 5 ELA 4s - All Students is 79
Grade 5 ELA 4s - American Indian or Alaska Native is 80
Grade 5 ELA 4s - Black or African American is 81
Grade 5 ELA 4s - Hispanic or Latino is 82
Grade 5 ELA 4s - Asian or Pacific Islander is 83
Grade 5 ELA 4s - White is 84
Grade 5 ELA 4s - Multiracial is 85
Grade 5 ELA 4s - Limited English Proficient is 86
Grade 5 ELA 4s - Economically Disadvantaged is 87
Grade 5 Math - All Students Tested is 88
Grade 5 Math 4s - All Students is 89
Grade 5 Math 4s - American Indian or Alaska Native is 90
Grade 5 Math 4s - Black or African American is 91
Grade 5 Math 4s - Hispanic or Latino is 92
Grade 5 Math 4s - Asian or Pacific Islander is 93
Grade 5 Math 4s - White is 94
Grade 5 Math 4s - Multiracial is 95
Grade 5 Math 4s - Limited English Proficient is 96
Grade 5 Math 4s - Economically Disadvantaged is 97
Grade 6 ELA - All Students Tested is 98
Grade 6 ELA 4s - All Students is 99
Grade 6 ELA 4s - American Indian or Alaska Native is 100
Grade 6 ELA 4s - Black or African American is 101
Grade 6 ELA 4s - Hispanic or Latino is 102
Grade 6 ELA 4s - Asian or Pacific Islander is 103
Grade 6 ELA 4s - White is 104
Grade 6 ELA 4s - Multiracial is 105
Grade 6 ELA 4s - Limited English Proficient is 106
Grade 6 ELA 4s - Economically Disadvantaged is 107
Grade 6 Math - All Students Tested is 108
Grade 6 Math 4s - All Students is 109
Grade 6 Math 4s - American Indian or Alaska Native is 110
Grade 6 Math 4s - Black or African American is 111
Grade 6 Math 4s - Hispanic or Latino is 112
Grade 6 Math 4s - Asian or Pacific Islander is 113
Grade 6 Math 4s - White is 114
Grade 6 Math 4s - Multiracial is 115
Grade 6 Math 4s - Limited English Proficient is 116
Grade 6 Math 4s - Economically Disadvantaged is 117
Grade 7 ELA - All Students Tested is 118
Grade 7 ELA 4s - All Students is 119
Grade 7 ELA 4s - American Indian or Alaska Native is 120
Grade 7 ELA 4s - Black or African American is 121
Grade 7 ELA 4s - Hispanic or Latino is 122
Grade 7 ELA 4s - Asian or Pacific Islander is 123
Grade 7 ELA 4s - White is 124
Grade 7 ELA 4s - Multiracial is 125
Grade 7 ELA 4s - Limited English Proficient is 126
Grade 7 ELA 4s - Economically Disadvantaged is 127
Grade 7 Math - All Students Tested is 128
Grade 7 Math 4s - All Students is 129
Grade 7 Math 4s - American Indian or Alaska Native is 130
Grade 7 Math 4s - Black or African American is 131
Grade 7 Math 4s - Hispanic or Latino is 132
Grade 7 Math 4s - Asian or Pacific Islander is 133
Grade 7 Math 4s - White is 134
Grade 7 Math 4s - Multiracial is 135
Grade 7 Math 4s - Limited English Proficient is 136
Grade 7 Math 4s - Economically Disadvantaged is 137
Grade 8 ELA - All Students Tested is 138
Grade 8 ELA 4s - All Students is 139
Grade 8 ELA 4s - American Indian or Alaska Native is 140
Grade 8 ELA 4s - Black or African American is 141
Grade 8 ELA 4s - Hispanic or Latino is 142
Grade 8 ELA 4s - Asian or Pacific Islander is 143
Grade 8 ELA 4s - White is 144
Grade 8 ELA 4s - Multiracial is 145
Grade 8 ELA 4s - Limited English Proficient is 146
Grade 8 ELA 4s - Economically Disadvantaged is 147
Grade 8 Math - All Students Tested is 148
Grade 8 Math 4s - All Students is 149
Grade 8 Math 4s - American Indian or Alaska Native is 150
Grade 8 Math 4s - Black or African American is 151
Grade 8 Math 4s - Hispanic or Latino is 152
Grade 8 Math 4s - Asian or Pacific Islander is 153
Grade 8 Math 4s - White is 154
Grade 8 Math 4s - Multiracial is 155
Grade 8 Math 4s - Limited English Proficient is 156
Grade 8 Math 4s - Economically Disadvantaged is 157

#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,15,16,17,18,19,20,21,22,36,37]]

corr = school_corr.corr()
corr = (corr)

fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="RdYlGn", center=0, annot=True)

#plt.savefig('img/correlation.png',bbox_inches='tight')

<matplotlib.axes._subplots.AxesSubplot at 0x224669eb940>

#Correlation matrix of school frame
school_corr = school_frame.iloc[:,[13,14,15,16,17,18,20,21,22]]

corr = school_corr.corr()
corr = (corr)

fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(11.7, 8.27)
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="RdYlGn", center=0, annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x22467279780>

#Test Prep and Tutoring

minority_frame = school_frame_raw[school_frame_raw['Percent Black / Hispanic'].astype(int)>70]
print(minority_frame.shape)

(829, 161)

def grade_minority_percent_4s (df, grade, subject):
    out_field = ('Grade %d %s Minority 4s')%(grade, subject)
    num1 = ('Grade %d %s 4s - Black or African American')%(grade, subject)
    num2 = ('Grade %d %s 4s - Hispanic or Latino')%(grade, subject)
    den = ('Grade %d %s 4s - All Students')%(grade, subject)
    df = df[df[den].astype(int)>5]
    df[out_field] = (df[num1] + df[num2])/(df[den])
    grade_minority_df = df[df[out_field]>0.7]
    print(grade, subject, df.shape, grade_minority_df.shape, (grade_minority_df[num1] + grade_minority_df[num2]).mean(), df[num1].mean(), df[num2].mean())
    return df

import itertools

grades = [5,6,7]
subjects = ['ELA', 'Math']
i=1
plt.figure(figsize=[19,10])
df_all_schools = pd.DataFrame(columns=minority_frame.columns)
for grade,subject in itertools.product(grades, subjects):
    df_schools_tmp = grade_minority_percent_4s(minority_frame, grade, subject)
    
#     df_schools_tmp[]
    fname = ('Grade %d %s Minority 4s')%(grade, subject)
    plt.subplot(3,2,i)
    df_schools_tmp[fname].hist()
    df_schools_tmp.drop([fname], axis=1, inplace=True)
    plt.title(fname)
    df_all_schools = pd.concat([df_all_schools, df_schools_tmp])
    i += 1
    
plt.savefig('img/minority4s.png',bbox_inches='tight')

5 ELA (133, 162) (80, 162) 9.5625 4.0 3.8345864661654137

C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

C:\Users\Omar\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

5 Math (175, 162) (107, 162) 10.897196261682243 4.377142857142857 5.428571428571429
6 ELA (143, 162) (93, 162) 12.698924731182796 6.2027972027972025 4.671328671328672
6 Math (185, 162) (122, 162) 17.639344262295083 7.383783783783784 7.335135135135135
7 ELA (104, 162) (61, 162) 11.60655737704918 4.0576923076923075 5.288461538461538
7 Math (115, 162) (70, 162) 15.142857142857142 5.426086956521739 6.7043478260869565

#SHSAT Analysis
shsat_frame.head()

shsat_frame['Took Percentage'] = shsat_frame['Number of students who took the SHSAT'] / shsat_frame['Number of students who registered for the SHSAT']

shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted.head()

shsat_sorted = shsat_frame.groupby(['DBN','School name'], as_index=False).sum()
shsat_sorted = shsat_sorted[['DBN','School name', 
                                 'Enrollment on 10/31', 
                                 'Number of students who registered for the SHSAT', 
                                 'Number of students who took the SHSAT']].join(shsat_frame[['Year of SHST', 
                                                                                          'Grade level']], how='inner')
shsat_sorted = shsat_sorted.sort_values('Number of students who registered for the SHSAT',ascending=False)

shsat_sorted.head()

shsat_sorted['Took Percentage'] = shsat_sorted['Number of students who took the SHSAT'] / shsat_sorted['Number of students who registered for the SHSAT']

sns.set(style="whitegrid")

# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(15, 10))

# Plot the total schools per city
sns.set_color_codes("pastel")
sns.barplot(x="Number of students who registered for the SHSAT", y='School name', data=shsat_sorted,
            label="# of SHSAT Registrations", color="b", ci=None)

# Plot the total community schools per city
sns.set_color_codes("muted")
sns.barplot(x="Number of students who took the SHSAT", y="School name", data=shsat_sorted,
            label="# of Students who Took SHSAT", color="b",ci=None)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 300), ylabel="School Name", title='SHSAT School Registration Distribution',
       xlabel="# of Registrations")
sns.despine(left=True, bottom=True)

plt.savefig('img/shsatregdist.png',bbox_inches='tight')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
sns.barplot(y='School name', x='Took Percentage', data=shsat_sorted.sort_values('Took Percentage', ascending = False), ax=ax, orient='h', ci=None, color='b')

<matplotlib.axes._subplots.AxesSubplot at 0x224673569e8>

shsat_low_appearance_df = shsat_sorted[shsat_sorted['Took Percentage'] <= 0.4]
shsat_low_appearance_df.sort_values('Took Percentage')

poor_schools_shsat_performance = list(np.unique(shsat_sorted['DBN']))
# poor_schools_shsat_performance = [i.strip().lower() for i in poor_schools_shsat_performance]

total_schools = list(set(school_frame['Location Code']))
# total_schools = [i.strip().lower() for i in total_schools]

intersection = list(set(poor_schools_shsat_performance).intersection(set(total_schools)))

print('Number of Intersecting schools = {}'.format(len(intersection)))

Number of Intersecting schools = 21

percent_black_hist = []
for school in intersection:
    percent_black_hist.append(list(school_frame[school_frame['Location Code'] == school]['Percent Black / Hispanic'])[0])
  
plt.figure(figsize=(10,5))
sns.distplot(percent_black_hist, kde=False)

temp = plt.title('Percent Black / Hispanic in low SHSAT-taking schools', fontsize=15)
temp = plt.xlabel("Percent Black / Hispanic", fontsize=15)
temp = plt.ylabel("Number of Schools", fontsize=15)


plt.savefig('img/percentbhshsat.png',bbox_inches='tight')

#As seen from the histogram above, the percentage of Black and Hispanic students is high where the percent of students taking the exam after registration is low.

#After School Programs 2.0 - School Performance Index
features_list = ['Rigorous Instruction %',
'Collaborative Teachers %',
'Supportive Environment %',
'Effective School Leadership %',
'Strong Family-Community Ties %',
'Trust %']

school_frame[['School Name'] + features_list ].head()

#Looking at correlation of these features
school_frame[features_list].corr()

#From this we can see how correlated these features are to eachother

corr = school_frame[features_list].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='RdYlGn')
temp = plt.xticks(rotation=75, fontsize=15) 
temp = plt.yticks(fontsize=15) 

plt.savefig('img/schoolperformance.png',bbox_inches='tight')

#The highest correlation is between Effective School Leadership, Collaborative Teachers and, Trust 
#Principal Component Anaysis (PCA)' has been applied on the 3 features to get a single combined feature capturing the key characteristics of all those three features.

from sklearn.decomposition import PCA

correlated_features_list = ["Effective School Leadership %","Collaborative Teachers %","Trust %"]
corr_features_values = school_frame[correlated_features_list].values

pca = PCA(n_components=1)
combined_feature_value = pca.fit_transform(corr_features_values)
school_frame['PCA Combined Feature'] = combined_feature_value
#df_schools[correlated_features_list + ['PCA Combined Feature']].corr()

import sklearn

scaler = sklearn.preprocessing.MinMaxScaler()
scale_factor = 2*(school_frame['PCA Combined Feature'].corr(school_frame["Effective School Leadership %"])>0) -1 
school_frame['PCA Combined Feature'] =  scaler.fit_transform(scale_factor * school_frame['PCA Combined Feature'].values.reshape(-1,1))*100

print ("The correlation between the three correlated features and their PCA is shown below:")
school_frame[correlated_features_list + ['PCA Combined Feature']].corr()

The correlation between the three correlated features and their PCA is shown below:

#Weights to each factor for calculating School Performace Indicator:
#1.0 := Supportive Environment % 
#0.8 := Rigorous Instruction % 
#0.7 := PCA combined feature % 
#0.5 := Strong Family-Community Ties % 
#NOTE: PCA combined features incorporates these three features - Effective School Leadership %, Collaborative Teachers %, Trust %

features = ['Rigorous Instruction %','Supportive Environment %','PCA Combined Feature',
            'Strong Family-Community Ties %']
weights = [0.8, 1, 0.7, 0.5]

school_frame['SPI'] = school_frame[features].dot(weights)


print ("A glimpse of the School Performance Index (SPI) :")
school_frame[features+['SPI']].head(5)

A glimpse of the School Performance Index (SPI) :

#Separting schools into low and high performance groups

df_low_spi_schools = school_frame[school_frame['SPI'] < school_frame['SPI'].quantile(.25)]
df_high_spi_schools = school_frame[school_frame['SPI'] > school_frame['SPI'].quantile(.25)]

#fig = plot_city_hist(df_low_spi_schools, 'Distribution of low SPI schools by city')
#iplot(fig)

#We can see that there is high disparity in the ELA/Math proficiency amongst school students with high/low SPI.

print ("Average ELA / Math performance for the high SPI schools")
df_high_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()

Average ELA / Math performance for the high SPI schools

Average ELA Proficiency     2.592795
Average Math Proficiency    2.703537
dtype: float64

print ("Average ELA / Math performance for the low SPI schools")
df_low_spi_schools[['Average ELA Proficiency','Average Math Proficiency']].mean()

Average ELA / Math performance for the low SPI schools

Average ELA Proficiency     2.366839
Average Math Proficiency    2.370323
dtype: float64

import base64
from IPython.display import HTML

def download_link(df, filename = "data.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    title = "Download CSV file"
    html = '<button type="button" style="font-size: larger;  background-color: #FFFFFF; border: 0pt;"><a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a></button>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

dfn = df_low_spi_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'low_spi_schools.csv')

(155, 160)

dfn = df_all_schools.drop_duplicates()
print(dfn.shape)
download_link(dfn, 'top_performing_students.csv')

(355, 161)

	School Name	Percent of Students Chronically Absent	Percent Black / Hispanic	Economic Need Index
1261	SUCCESS ACADEMY CHARTER SCHOOL - BRONX 2	100.0	98.0	0.794
1260	SUCCESS ACADEMY CHARTER SCHOOL - BRONX 1	100.0	96.0	0.825
1206	SUCCESS ACADEMY CHARTER SCHOOL - UPPER WEST	100.0	48.0	0.369
1203	SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 5	100.0	95.0	0.789
1198	SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 4	100.0	94.0	0.711
1197	SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 3	100.0	94.0	0.720
1196	SUCCESS ACADEMY CHARTER SCHOOL - HARLEM 2	100.0	97.0	0.734
1173	BROOKLYN LAB CHARTER SCHOOL	100.0	89.0	0.639
1121	SUCCESS ACADEMY CHARTER SCHOOL - BED-STUY 1	100.0	96.0	0.750
300	NEW DIRECTIONS SECONDARY SCHOOL	74.0	98.0	0.920
12	ORCHARD COLLEGIATE ACADEMY	58.0	84.0	0.832
631	P.S. 306 ETHAN ALLEN	57.0	98.0	0.875
527	DIGITAL ARTS AND CINEMA TECHNOLOGY HIGH SCHOOL	52.0	93.0	0.749
755	P.S. 165 IDA POSNER	50.0	97.0	0.815
110	P.S. 050 VITO MARCANTONIO	49.0	98.0	0.890
135	P.S. 123 MAHALIA JACKSON	49.0	93.0	0.835
874	P.S./M.S 042 R. VERNAM	48.0	95.0	0.825
757	P.S. 184 NEWPORT	48.0	98.0	0.904
432	EAST BRONX ACADEMY FOR THE FUTURE	48.0	98.0	0.853
244	M.S. 302 LUISA DESSUS CRUZ	48.0	98.0	0.906
139	EAGLE ACADEMY FOR YOUNG MEN OF HARLEM	47.0	98.0	0.747
761	P.S. 327 DR. ROSE B. ENGLISH	47.0	96.0	0.851
469	SATELLITE WEST MIDDLE SCHOOL	47.0	94.0	0.859
466	SATELLITE EAST MIDDLE SCHOOL	47.0	78.0	0.856
548	M.S. 267 MATH, SCIENCE & TECHNOLOGY	46.0	87.0	0.826
714	P.S. 288 THE SHIRLEY TANYHILL	46.0	90.0	0.912
402	LEADERS OF TOMORROW	46.0	90.0	0.771
756	P.S. 178 SAINT CLAIR MCKELWAY	46.0	95.0	0.884
243	M.S. 301 PAUL L. DUNBAR	45.0	98.0	0.823
1008	P.S. 111 JACOB BLACKWELL	45.0	89.0	0.847
294	BRONX WRITING ACADEMY	45.0	95.0	0.892
549	P.S. 308 CLARA CARDWELL	44.0	96.0	0.828
257	P.S./M.S. 004 CROTONA PARK WEST	44.0	98.0	0.903
767	MOTT HALL IV	44.0	98.0	0.771
900	P.S. 183 DR. RICHARD R. GREEN	43.0	87.0	0.839
277	J.H.S. 145 ARTURO TOSCANINI	43.0	98.0	0.888
185	M.S. 328 - MANHATTAN MIDDLE SCHOOL FOR SCIENTI...	43.0	97.0	0.834
753	P.S./ I.S. 155 NICHOLAS HERKIMER	43.0	96.0	0.877
751	P.S./I.S. 137 RACHEL JEAN MITCHELL	43.0	94.0	0.807
649	HIGHLAND PARK COMMUNITY SCHOOL	43.0	96.0	0.818
265	P.S. 058	43.0	97.0	0.910
762	BROWNSVILLE COLLABORATIVE MIDDLE SCHOOL	42.0	96.0	0.925
293	I.S. 313 SCHOOL OF LEADERSHIP DEVELOPMENT	42.0	99.0	0.881
750	P.S. 041 FRANCIS WHITE	42.0	95.0	0.900
77	P.S. 076 A. PHILIP RANDOLPH	42.0	96.0	0.850
146	URBAN ASSEMBLY ACADEMY FOR FUTURE LEADERS	42.0	98.0	0.817
440	URBAN ASSEMBLY SCHOOL FOR WILDLIFE CONSERVATION	42.0	96.0	0.809
552	M.S. 584	41.0	97.0	0.885
626	J.H.S. 218 JAMES P. SINNOTT	41.0	79.0	0.857

	School Name	Economic Need Index	Percent Black / Hispanic
298	NEW MILLENNIUM BUSINESS ACADEMY MIDDLE SCHOOL	0.938	98.0
283	I.S. 219 NEW VENTURE SCHOOL	0.929	98.0
1247	MOTT HAVEN ACADEMY CHARTER SCHOOL	0.928	98.0
762	BROWNSVILLE COLLABORATIVE MIDDLE SCHOOL	0.925	96.0
300	NEW DIRECTIONS SECONDARY SCHOOL	0.920	98.0
346	P.S. 279 CAPTAIN MANUEL RIVERA, JR.	0.918	96.0
11	P.S. 188 THE ISLAND SCHOOL	0.917	93.0
299	I.S. 339	0.913	98.0
714	P.S. 288 THE SHIRLEY TANYHILL	0.912	90.0
363	M.S. 390	0.911	98.0

	DBN	School name	Year of SHST	Grade level	Enrollment on 10/31	Number of students who registered for the SHSAT	Number of students who took the SHSAT	Took Percentage
0	05M046	P.S. 046 Arthur Tappan	8058	32	315	98	39	1.597034
1	05M123	P.S. 123 Mahalia Jackson	8058	32	241	84	37	1.700780
2	05M129	P.S. 129 John H. Finley	8058	32	252	108	48	2.396452
3	05M148	Eagle Academy for Young Men of Harlem	6047	25	222	12	11	1.857143
4	05M157	The Urban Assembly School for Global Commerce	8058	36	309	7	3	1.333333

	DBN	School name	Enrollment on 10/31	Number of students who registered for the SHSAT	Number of students who took the SHSAT	Year of SHST	Grade level
22	84M336	KIPP Infinity Charter School	494	296	65	2016	8
29	84M726	KIPP STAR College Prep Charter School	292	292	96	2015	8
10	05M362	Columbia Secondary School	794	253	179	2015	8
20	84M065	Democracy Prep Endurance Charter School	567	194	94	2014	8
24	84M350	Democracy Prep Charter School	943	145	100	2014	8

	DBN	School name	Enrollment on 10/31	Number of students who registered for the SHSAT	Number of students who took the SHSAT	Year of SHST	Grade level	Took Percentage
22	84M336	KIPP Infinity Charter School	494	296	65	2016	8	0.219595
6	05M286	I.S. M286 Renaissance Leadership Academy	165	109	28	2015	8	0.256881
7	05M286	Urban Assembly Academy for Future Leaders	93	90	25	2016	8	0.277778
12	05M367	Academy for Social Action: A College Board School	254	14	4	2015	8	0.285714
29	84M726	KIPP STAR College Prep Charter School	292	292	96	2015	8	0.328767
9	05M304	Mott Hall High School	504	3	1	2014	8	0.333333
0	05M046	P.S. 046 Arthur Tappan	315	98	39	2013	8	0.397959

	Adjusted Grade	New?	Other Location Code in LCGMS	School Name	SED Code	Location Code	District	Latitude	Longitude	Address (Full)	City	Zip	Grades	Grade Low	Grade High	Community School?	Economic Need Index	School Income Estimate	Percent ELL	Percent Asian	Percent Black	Percent Hispanic	Percent Black / Hispanic	Percent White	Student Attendance Rate	Percent of Students Chronically Absent	Rigorous Instruction %	Rigorous Instruction Rating	Collaborative Teachers %	Collaborative Teachers Rating	Supportive Environment %	Supportive Environment Rating	Effective School Leadership %	Effective School Leadership Rating	Strong Family-Community Ties %	Strong Family-Community Ties Rating	Trust %	Trust Rating	Student Achievement Rating	Average ELA Proficiency	Average Math Proficiency	Grade 3 ELA - All Students Tested	Grade 3 ELA 4s - All Students	Grade 3 ELA 4s - Black or African American	Grade 3 ELA 4s - Hispanic or Latino	Grade 3 ELA 4s - Asian or Pacific Islander	Grade 3 ELA 4s - Economically Disadvantaged	Grade 3 Math - All Students tested	Grade 3 Math 4s - All Students	Grade 3 Math 4s - Black or African American	Grade 3 Math 4s - Hispanic or Latino	Grade 3 Math 4s - Asian or Pacific Islander	Grade 3 Math 4s - Economically Disadvantaged	Grade 4 ELA - All Students Tested	Grade 4 ELA 4s - All Students	Grade 4 ELA 4s - Black or African American	Grade 4 ELA 4s - Hispanic or Latino	Grade 4 ELA 4s - Asian or Pacific Islander	Grade 4 ELA 4s - White	Grade 4 ELA 4s - Economically Disadvantaged	Grade 4 Math - All Students Tested	Grade 4 Math 4s - All Students	Grade 4 Math 4s - Black or African American	Grade 4 Math 4s - Hispanic or Latino	Grade 4 Math 4s - Asian or Pacific Islander	Grade 4 Math 4s - White	Grade 4 Math 4s - Economically Disadvantaged	Grade 5 ELA - All Students Tested	Grade 5 ELA 4s - All Students	Grade 5 ELA 4s - Black or African American	Grade 5 ELA 4s - Hispanic or Latino	Grade 5 ELA 4s - Asian or Pacific Islander	Grade 5 ELA 4s - Economically Disadvantaged	Grade 5 Math - All Students Tested	Grade 5 Math 4s - All Students	Grade 5 Math 4s - Hispanic or Latino	Grade 5 Math 4s - Asian or Pacific Islander	Grade 5 Math 4s - Limited English Proficient	Grade 5 Math 4s - Economically Disadvantaged	Grade 6 ELA - All Students Tested	Grade 6 ELA 4s - All Students	Grade 6 ELA 4s - Hispanic or Latino	Grade 6 ELA 4s - Economically Disadvantaged	Grade 6 Math - All Students Tested	Grade 6 Math 4s - All Students	Grade 6 Math 4s - Economically Disadvantaged	Grade 7 ELA - All Students Tested	Grade 7 ELA 4s - All Students	Grade 7 ELA 4s - Hispanic or Latino	Grade 7 Math - All Students Tested	Grade 7 Math 4s - All Students	Grade 7 Math 4s - Hispanic or Latino	Grade 8 ELA - All Students Tested	Grade 8 ELA 4s - All Students	Grade 8 Math - All Students Tested	Grade 8 Math 4s - All Students
0	NaN	NaN	NaN	P.S. 015 ROBERTO CLEMENTE	310100010015	01M015	1	40.721834	-73.978766	333 E 4TH ST NEW YORK, NY 10009	NEW YORK	10009	PK,0K,01,02,03,04,05	PK	05	Yes	0.919	$31,141.72	9%	5%	32%	60%	92%	1%	94%	18%	89%	Meeting Target	94%	Meeting Target	86%	Exceeding Target	91%	Exceeding Target	85%	Meeting Target	94%	Exceeding Target	Approaching Target	2.14	2.17	20	0	0	0	0	0	21	0	0	0	0	0	15	0	0	0	0	0	0	15	2	0	0	0	0	0	16	0	0	0	0	0	16	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
1	NaN	NaN	NaN	P.S. 019 ASHER LEVY	310100010019	01M019	1	40.729892	-73.984231	185 1ST AVE NEW YORK, NY 10003	NEW YORK	10003	PK,0K,01,02,03,04,05	PK	05	No	0.641	$56,462.88	5%	10%	20%	63%	83%	6%	92%	30%	96%	NaN	96%	NaN	97%	NaN	90%	Exceeding Target	86%	Meeting Target	94%	Meeting Target	Exceeding Target	2.63	2.98	33	2	1	1	0	0	33	6	2	1	0	4	29	5	0	3	0	0	3	28	10	0	6	0	0	8	32	7	3	1	2	6	32	4	1	2	0	3	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
2	NaN	NaN	NaN	P.S. 020 ANNA SILVER	310100010020	01M020	1	40.721274	-73.986315	166 ESSEX ST NEW YORK, NY 10002	NEW YORK	10002	PK,0K,01,02,03,04,05	PK	05	No	0.744	$44,342.61	15%	35%	8%	49%	57%	4%	94%	20%	87%	Meeting Target	77%	Meeting Target	82%	Approaching Target	61%	Not Meeting Target	80%	Approaching Target	79%	Not Meeting Target	Approaching Target	2.39	2.54	76	6	0	0	4	2	76	11	0	3	7	6	70	9	0	1	6	2	1	71	13	0	0	11	2	4	73	2	0	1	1	0	73	10	1	9	1	6	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0
3	NaN	NaN	NaN	P.S. 034 FRANKLIN D. ROOSEVELT	310100010034	01M034	1	40.726147	-73.975043	730 E 12TH ST NEW YORK, NY 10009	NEW YORK	10009	PK,0K,01,02,03,04,05,06,07,08	PK	08	No	0.860	$31,454.00	7%	5%	29%	63%	92%	4%	92%	28%	85%	Approaching Target	78%	Meeting Target	82%	Meeting Target	73%	Approaching Target	89%	Meeting Target	88%	Meeting Target	Exceeding Target	2.48	2.47	27	0	0	0	0	0	29	4	0	2	0	0	35	1	0	1	0	0	0	34	1	0	1	0	0	0	29	0	0	0	0	0	29	1	1	0	0	0	54	3	1	3	54	3	3	55	4	3	55	3	3	47	1	48	1
4	NaN	NaN	NaN	THE STAR ACADEMY - P.S.63	310100010063	01M063	1	40.724404	-73.986360	121 E 3RD ST NEW YORK, NY 10009	NEW YORK	10009	PK,0K,01,02,03,04,05	PK	05	No	0.730	$46,435.59	3%	4%	20%	65%	84%	10%	93%	23%	90%	Meeting Target	88%	Meeting Target	87%	Meeting Target	81%	Meeting Target	89%	Meeting Target	93%	Meeting Target	Meeting Target	2.38	2.54	21	2	0	2	0	0	21	5	0	2	0	2	15	2	1	0	0	0	0	15	3	1	0	0	0	0	12	1	0	0	0	1	12	2	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

	School Name	Rigorous Instruction %	Collaborative Teachers %	Supportive Environment %	Effective School Leadership %	Strong Family-Community Ties %	Trust %
3	P.S. 034 FRANKLIN D. ROOSEVELT	85.0	78.0	82.0	73.0	89.0	88.0
8	P.S. 140 NATHAN STRAUS	94.0	91.0	85.0	87.0	83.0	93.0
10	P.S. 184M SHUANG WEN	90.0	81.0	91.0	67.0	83.0	85.0
11	P.S. 188 THE ISLAND SCHOOL	100.0	100.0	99.0	99.0	92.0	99.0
12	ORCHARD COLLEGIATE ACADEMY	72.0	77.0	77.0	72.0	76.0	87.0

	Rigorous Instruction %	Collaborative Teachers %	Supportive Environment %	Effective School Leadership %	Strong Family-Community Ties %	Trust %
Rigorous Instruction %	1.000000	0.757568	0.720514	0.643093	0.540730	0.708931
Collaborative Teachers %	0.757568	1.000000	0.594114	0.915435	0.485351	0.882959
Supportive Environment %	0.720514	0.594114	1.000000	0.488354	0.581007	0.714141
Effective School Leadership %	0.643093	0.915435	0.488354	1.000000	0.444356	0.853213
Strong Family-Community Ties %	0.540730	0.485351	0.581007	0.444356	1.000000	0.588877
Trust %	0.708931	0.882959	0.714141	0.853213	0.588877	1.000000

	Rigorous Instruction %	Supportive Environment %	PCA Combined Feature	Strong Family-Community Ties %	SPI
3	85.0	82.0	78.859046	89.0	249.701332
8	94.0	85.0	90.408101	83.0	264.985671
10	90.0	91.0	76.608591	83.0	258.126014
11	100.0	99.0	100.000000	92.0	295.000000
12	72.0	77.0	77.852373	76.0	227.096661